// Close any log named disweights that is still open (capture suppresses the error when none is open)
capture log close disweights
// Open the named text log for this do-file, replacing any existing file of the same name
log using "${logdir}\2_disweights.txt", text replace name(disweights)

*_____________________________________________________________________________________________________________________________________________________
*
**# 0. ENSURE THAT NO MISSING DATA
* (This causes problems for the bootstrapping, so need to ensure done at this stage so that disability prevalence is comparable across measures)
*_____________________________________________________________________________________________________________________________________________________

// Sample selection
// Order of operations: (1) drop SHARE cases with no disability report, (2) keep the chosen
// country-waves and age range, (3) log item-missingness and drop incomplete cases,
// (4) apply the optional sample restriction stored in the global named if.
drop if survey=="SHARE" & missing(${disvar})	// CHANGE FOR EACH SURVEY! The disability variable is not included in the item-missing list below, because one strength of these techniques is that they can be run on waves that have all covariates but not this variable. It is still needed here: 0.04% of SHARE has it missing, and it is easiest to exclude them so that they are not resampled
* Checking age is comparable across country - different waves start in different years (as the cohorts are not always refreshed)
	/* Initial checks
	version 14
	table ${countryyvar} 		if !missing(${pweight}), c(min ragey max ragey)
	*/
	* Using this to decide which countries to include
	keep if ${countrywaves}
	keep if ${agerange}
	/* And checking again
	version 14
	table ${countryyvar} 		if !missing(${pweight}), c(min ragey max ragey)
	tab ${countryyvar} ragey 	if !missing(${pweight}), row nof
	*/
* Dropping item-missing cases
* Remove the i. factor-variable prefix so that the list can be passed to egen and misschk as plain variable names
local strippedvars = subinstr("${controls} ${controls2} ${empvar} ${allIRTbinary} ${allIRTordinal} ${allIRTnominal}", "i.", "", .)
egen misscount = rowmiss(`strippedvars')		// number of listed variables that are missing for each observation
	* Checks, written to a separate named log
	* The capture guard mirrors the one used for the main log: without it, a run that stopped while this
	* log was open would make the next run fail on the log using line
	capture log close missingness
	log using "${dodir}\Outputs\2_disweights_missingness.txt", text replace name(missingness) 
		dis in red "ITEM MISSINGINESS (excluding ${disvar})"
		misschk `strippedvars'
		tab misscount country
	log close missingness
	* Actually doing the drops & cleaning up
	drop if misscount~=0					
	drop misscount
* Optional extra restriction; the global is expected to hold a complete if-clause, since it is placed directly after keep
if ("${if}"~="")		keep ${if}			// So that bootstrap is not resampling people out of sample by definition


// Rescale the weights within the retained subsample so that the mean weight equals 1 in every country-wave
levelsof ${countryvar}, local(cwlist)		// list of country-wave codes (also displayed, as before)
foreach cw of local cwlist {
	display "Country-wave `cw'"
	* Mean weight within this country-wave ...
	summarize ${pweight} if ${countryvar}==`cw'
	* ... which then becomes the divisor for that country-wave only
	replace ${pweight} = ${pweight} /`r(mean)' if ${countryvar}==`cw' 
}


*_____________________________________________________________________________________________________________________________________________________
*
**# 1. PREDICTED DISABILITY WEIGHTS
* No variation around the coefficients, as want to get sampling uncertainty not coefficient uncertainty
*_____________________________________________________________________________________________________________________________________________________

	
// Build the working list of control variables (continuous controls plus dummies expanded from factor terms)
* Any factor controls are expanded into I-prefixed dummies with xi (when created, this was country##education)
if "${disweight_factors}"!="" {
	capture drop I*
	local xiterms = subinstr("${disweight_factors}", "##", "*", .)		// xi writes interactions with a different symbol
	xi `xiterms', prefix(I)
	unab newvars: I* 
}
local disweight_controls "${disweight_controls} `newvars'"
display "`disweight_controls'"
/*  THE REVISED CONTROL LIST HELD IN disweight_controls2:
	Each control gets a working copy (suffix _pred) that enters the model with its actual values and is later
	overwritten with its mean before PREDICT, plus a constant (suffix _mean) holding that mean.
	NOTE: setting binary variables to their mean is slightly odd, but works well for ensuring that predicted disability and observed 
		disability have the same prevalence among people with observations on both (which is the aim).  
	This is also done for COUNTRY (if this is included in the controls) - to try and get the actual association of impairments with disability,
		you need suitable controls. */
global disweight_controls2 ""
foreach ctrl in `disweight_controls' {
	* Working copy that enters the regression
	capture drop `ctrl'_pred
	generate `ctrl'_pred = `ctrl'
	label variable `ctrl'_pred	"`ctrl' copy for prediction (actual value for model, mean for prediction)"
	* Constant holding the survey-weighted mean of that copy
	capture drop `ctrl'_mean
	quietly svy: mean `ctrl'_pred  
	matrix working = e(b)
	generate `ctrl'_mean = working[1,1]
	label variable `ctrl'_mean "`ctrl' mean value in full sample"
	summarize `ctrl'_mean			// a check
	* Append the working copy to the list used in the regression models
	global disweight_controls2 "${disweight_controls2} `ctrl'_pred"
}
	

// Regression-based weights
* Two survey-weighted logits are stored. The comparator (all indicators) is only there to show the negative
* weights that arise; the second model is the one used for prediction, so it must be estimated last.
eststo comparator:		svy: logit ${disvar} ${allindicators} 	${disweight_controls2}
eststo ${disvar}:		svy: logit ${disvar} ${indicators} 		${disweight_controls2}
	* Overwrite every control copy - including COUNTRY! - with its mean, so that controls do not drive the prediction
	foreach ctrl in `disweight_controls' {
		replace `ctrl'_pred = `ctrl'_mean 
	}
	* The prediction: probability of disability from the last model estimated
	predict p_predicted
	label variable p_predicted "PREDICTED VALUES of ${disvar} from regression weights"
	* Rescale so that the weighted mean of the prediction equals that of the observed measure.
	* The displayed means show that prevalence is already very close (if not quite identical).
	svy: mean ${disvar} p_predicted if !missing(${disvar}) & !missing(p_predicted)
	local rescale = r(table)[1,1] / r(table)[1,2]		// observed mean divided by predicted mean
	replace p_predicted = p_predicted*`rescale'
	* Remove the working copies and the xi dummies
	drop *_pred *_mean 
	capture drop I*
	

* SAVING & OUTPUTTING
esttab ${disvar} comparator using "${dodir}\Outputs\2_dis_weights_predicted_${disvar}.csv", csv replace not p brackets nostar nonum nodepvars b(%4.3f) ///
	wide aic(%6.0fc) bic(%6.0fc) drop(*_pred) /// (+ 0.10 * 0.05 ** 0.01)  nopar 
	addnotes("First column is the specification used in the bootstrap" "Second column is to show negative weights if dont change spec using all vars in IRT spec") 



*_____________________________________________________________________________________________________________________________________________________
*
**# 2. WASHINGTON GROUP-STYLE MEASURE
*_____________________________________________________________________________________________________________________________________________________

* 1 if any of the WG indicators takes the value 1, 0 otherwise
egen ${WGvar} = anymatch(${WGindicators}), values(1)
label variable ${WGvar} "Disability (activity-limiting, WG measure)"
* Recode to .m wherever at least one indicator is missing, then discard the helper count
egen WGindicatormiss = rowmiss(${WGindicators})
replace ${WGvar} = .m if WGindicatormiss!=0
capture drop WGindicatormiss 


*_____________________________________________________________________________________________________________________________________________________
*
**# 3. IRT MEASURE
* (Weighted results simply wouldn't work, whether I used SVY: IRT or simply [pw=${pweight}])
*_____________________________________________________________________________________________________________________________________________________


// Hybrid IRT model using the longer list of indicators
* Assemble the model specification: binary items as 2pl, ordinal items (when supplied) as grm, nominal items (when supplied) as nrm
if "${allIRTordinal}"=="" {
	local allIRTspec "(2pl ${allIRTbinary}) "
}
else {
	local allIRTspec "(2pl ${allIRTbinary}) (grm ${allIRTordinal}) "
}
if "${allIRTnominal}"!="" {
	local allIRTspec "`allIRTspec' (nrm ${allIRTnominal})"
}
* Echo the command to the log, then estimate and store it
display `"eststo allIRT:	irt hybrid 	`allIRTspec'	"'
eststo allIRT:	irt hybrid 	`allIRTspec'			// , vce(cluster mergeid)
if `e(converged)'==0 {
	pause Model not converged
}
* Parameter table: keep columns 1, 5 and 6 of the transposed r(table)
estat report, byparm sort(b)
matrix allIRTout = r(table)'
matrix allIRTout = allIRTout[1...,1], allIRTout[1...,5], allIRTout[1...,6]
* Empirical Bayes posterior estimate of the latent variable
predict p_allIRT, latent ebmeans
label variable p_allIRT "DISABILITY SCORE using IRT measure with more indicators (hybrid / 2-parameter)"
	

// Hybrid IRT MAIN model
* Same construction as the longer model, but using the main indicator lists
if "${IRTordinal}"=="" {
	local IRTspec "(2pl ${IRTbinary})"
}
else {
	local IRTspec "(2pl ${IRTbinary}) (grm ${IRTordinal}) "
}
if "${IRTnominal}"!="" {
	local IRTspec "`IRTspec' (nrm ${IRTnominal})"
}
display `"eststo IRT:	irt hybrid 	`IRTspec' "'
eststo IRT:	irt hybrid 	`IRTspec'			// , vce(cluster mergeid)
if `e(converged)'==0 {
	pause Model not converged
}
* The report is posted here, so the stored IRT results are restored before predicting
estat report, byparm sort(b) post
matrix IRTout = r(table)'
matrix IRTout = IRTout[1...,1], IRTout[1...,5], IRTout[1...,6]
estimates restore IRT
predict p_IRT, latent ebmeans 
label variable p_IRT "DISABILITY SCORE using IRT measure (hybrid / 2-parameter)"
* Missingness is not set here, because this .do file is designed to run on complete cases only (otherwise the bootstrap will have errors)
	
	
// Outputs: the two stored models side by side, then the two parameter matrices (best effort, errors suppressed by capture)
esttab IRT allIRT using "${dodir}\Outputs\2_dis_weights_IRTregs.csv", wide csv replace not p nopar nostar nonum nodepvars b(%5.4f) // (+ 0.10 * 0.05 ** 0.01)  
capture esttab matrix(IRTout) 	  using "${dodir}\Outputs\2_dis_weights_IRT.csv"  , csv replace not p nopar nostar nonum nodepvars b(%5.4f) // (+ 0.10 * 0.05 ** 0.01)
capture esttab matrix(allIRTout)  using "${dodir}\Outputs\2_dis_weights_allIRT.csv", csv replace not p nopar nostar nonum nodepvars b(%5.4f) // (+ 0.10 * 0.05 ** 0.01)
 

// To see factor analysis/PCA versions of this, see 9_checks_and_alternatives.do


*_____________________________________________________________________________________________________________________________________________________
*
**# TURNING PROBABILITIES/SCALES INTO DISABILITY VARS
*_____________________________________________________________________________________________________________________________________________________

// PREDICTED: first version of the predicted disability variable, using a random draw
// NOTE: the bootstrap repeats this on every replication; it is done here only so that the graphs below can be drawn and the bootstrap syntax tested in advance
* One uniform draw per observation, with a fixed seed
capture drop rand1
set seed 1982
generate rand1 = runiform()
label variable rand1	"Random values, set in 4_dis_weights.do (but re-done within the bootstrap)"
* Classified as disabled when the predicted probability exceeds the draw; left missing where there is no prediction
generate ${predictedvar} = 0 if !missing(p_predicted)
replace ${predictedvar} = 1 if !missing(p_predicted) & p_predicted > rand1
label variable ${predictedvar} "Disability - regression-based measure using ${disvar}"
	* tab ${disvar} predicted_${disvar}, cell nof		// check


// 	FIXED VARS: Setting % disability to be the same as observed disability	
// 		New version with more precisely matched weighted prevalence than possible using _pctile (which requires using mat2txt)
// 		The svy: tab command is extremely slow, so there's various display flags so that you know where it crashes (if it crashes)
* Firstly, get the proportion of people who report a disability
	svy: prop ${disvar}
	matrix output = e(b)
	// Percentile at which to cut the score: 100 minus the percentage in the second column of e(b)
	// (presumably the share reporting a disability - assumes a two-level variable, TODO confirm)
	global prop_dis			= 100 - (100*output[1,2])

* Latent disability
// For each latent score, two candidate binary variables are built: suffix L uses the weighted percentile
// itself as the threshold, suffix H uses the next-lowest distinct score value as the threshold
// NOTE(review): observations with a missing score would be coded 1 by the comparisons below; the file
// relies on running on complete cases only - confirm that the scores are never missing
foreach v in IRT allIRT	{
	capture drop ${disvar}_`v'?			// just in case running this twice, we need to drop the variables we're about to create
	// Ensure that varying precision in globals doesn't cause problems 
		replace p_`v' = round(p_`v', 0.000001)
		recast double p_`v'					// to avoid rounding problems - see help precision
	// Find the cutpoint on the latent-variable score that produces the same level of disability
		* Firstly do _pctile, to get the lower bounded version 
		_pctile p_`v' if !missing(${disvar}) [pw=${pweight}], percentiles(${prop_dis})
			scalar `v'cutoff = `r(r1)'
		gen byte ${disvar}_`v'L = (p_`v' > `v'cutoff  + 0.0000001)				// see help precision for why the + 0.0000001 is added
			label var ${disvar}_`v'L				"Disability - latent measure (just below threshold)"
		* Then find the upper bounded version (from the next-lowest value of p_`v' going in reverse order)
		// The data are collapsed to one row per distinct score; the row matching the cutoff (within tolerance)
		// picks up the score from the row before it, and that value is carried out of the preserve block in a scalar
		preserve
			collapse (percent) prev=${disvar} (count) count=${disvar} [pw=${pweight}] if !missing(${disvar}), by(p_`v')
			gen prevvalue = p_`v'[_n-1] if abs(p_`v'-`v'cutoff)<0.0000001		// see help precision to understand why "if p_`v'==pred_cutoff" doesn't quite work here
			sum prevvalue 
			scalar `v'prevvalue = `r(mean)'
		restore
		gen byte ${disvar}_`v'H = (p_`v' > (`v'prevvalue + 0.0000001) )		// see help precision for why the + 0.0000001 is added
			label var ${disvar}_`v'H				"Disability - latent measure (just above threshold)"
	// Show both thresholds in the log
	dis _newline(20) "Cutoffs for `v' are" _newline(1) "L=" `v'cutoff _newline(1) "H=" `v'prevvalue
/**/						}

* Predicted disability, fixed version
* Deterministic counterpart of the random-draw measure: everyone whose predicted probability lies above a
* cut-off is classified as disabled, with the cut-off chosen so that weighted prevalence matches the observed measure.
* Two candidates are built: suffix L uses the weighted percentile itself as the threshold, suffix H uses the
* next-lowest distinct value of the prediction as the threshold.
	capture drop ${disvar}_predicted_fxd?			// just in case running this twice, we need to drop the variables we're about to create
	// Ensure that varying precision in globals doesn't cause problems 
		replace p_predicted = round(p_predicted, 0.000001)
		recast double p_predicted 					// to avoid rounding problems - see help precision
	// Find the cutpoint on the predicted probability that produces the same level of disability
		* Firstly do _pctile, to get the lower bounded version 
		_pctile p_predicted if !missing(${disvar}) [pw=${pweight}], percentiles(${prop_dis})
			scalar pred_cutoff = `r(r1)'
		* Missing values compare as larger than any number, so without the if-qualifier a missing prediction
		* would be coded as disabled; it is left missing instead, matching the random-draw version
		gen byte ${disvar}_predicted_fxdL = (p_predicted > pred_cutoff + 0.0000001) if !missing(p_predicted)		// see help precision for why the + 0.0000001 is added
			label var ${disvar}_predicted_fxdL				"Disability - regression-based measure WITHOUT random element (just below threshold)"
		* Then find the upper bounded version (from the next-lowest value of p_predicted going in reverse order)
		* The data are collapsed to one row per distinct prediction; the row matching the cutoff (within tolerance)
		* picks up the value from the row before it, which is carried out of the preserve block in a scalar
		preserve
			collapse (percent) prev=${disvar} (count) count=${disvar} [pw=${pweight}] if !missing(${disvar}), by(p_predicted)
			gen prevvalue = p_predicted[_n-1] if abs(p_predicted-pred_cutoff)<0.0000001		// see help precision to understand why "if p_predicted==pred_cutoff" doesn't quite work here
			sum prevvalue 
				scalar pred_prevvalue = `r(mean)'
		restore
		gen byte ${disvar}_predicted_fxdH = (p_predicted > (pred_prevvalue + 0.0000001) ) if !missing(p_predicted)		// the 0.0000001 is because of precision errors, even with the syntax above...
			label var ${disvar}_predicted_fxdH				"Disability - regression-based measure WITHOUT random element (just above threshold)"
* Show both thresholds in the log
dis _newline(20) "Cutoffs are" _newline(1) "L=" pred_cutoff _newline(1) "H=" pred_prevvalue

	
// DECIDING WHETHER JUST ABOVE OR JUST BELOW THE CUT-OFF IS BEST
* Compare the weighted prevalence of every candidate with the observed measure
svy: mean ${disvar}* ${WGvar} if !missing(${disvar})
pause Check
* Decision as of 11th July 2023, for llsiH - the locals should be the letters 'L' or 'H'
* The choices are read from words 2, 4 and 6 of the global whichversions
local whichIRT 		= word("${whichversions}", 2)
local whichallIRT 	= word("${whichversions}", 4)
local whichpredfxd 	= word("${whichversions}", 6)
* Stop with a clear message if any choice is not L or H, rather than failing on the rename below
foreach choice in whichIRT whichallIRT whichpredfxd {
	if !inlist("``choice''", "L", "H") {
		display as error "`choice' must be L or H - check global whichversions (${whichversions})"
		exit 198
	}
}
* Promote the chosen candidate of each measure to the final variable name.
* Each measure uses its own choice: the longer IRT measure previously reused the choice made for the main IRT measure.
rename ${IRTvar}`whichIRT' 					${IRTvar}
rename ${allIRTvar}`whichallIRT' 			${allIRTvar}
rename ${predictedvar}_fxd`whichpredfxd'	${predictedvar}_fxd 
* Quick check
svy: mean ${disvar} ${IRTvar} ${allIRTvar} ${predictedvar}_fxd  ${predictedvar} ${WGvar} if !missing(${disvar})
pause Requires checking by hand that these are the right choices
* Drop the candidates that were not chosen
drop ${IRTvar}? ${allIRTvar}? ${predictedvar}_fxd? 



// For alternative versions (e.g. to match World Disability Report) or checks (e.g. whether regime dis prevalence varies by cutpoint), see 9_checks_and_alternatives.do
			
		
*_____________________________________________________________________________________________________________________________________________________
*
**# GRAPHING THE DISABILITY VARS
*_____________________________________________________________________________________________________________________________________________________

// Graphing cumulative distn by dis classification
// (This used to be done for ${factorvar} separately to predicted values, but now done for fixed vs. random version of predicted var
// One graph per classification: cumulative weighted share of the sample along the predicted probability,
// split into those counted as disabled and those counted as non-disabled
local disvar = "${disvar}"			// can't have a global followed by a close brace (see following line)
foreach dismarker in `disvar'_predicted `disvar'_predicted_fxd {
	preserve
		* Title for this panel
		if "`dismarker'"=="${disvar}_predicted"		{
			local xtitle "Probabilistic disability"
		}
		if "`dismarker'"=="${disvar}_predicted_fxd"			{
			local xtitle "Fixed cut-off for disability"
		}
		* Manipulating the data into the right form, with one observation per level of the variable
		gen one = 1 
		collapse (sum) wtfreq=one disfreq=`dismarker' (rawsum) rawfreq=one rawdis=`dismarker' [pw=${pweight}], by(p_predicted)	
		* Remove the group with no prediction before accumulating. This line previously referred to a loop
		* macro from an earlier section that no longer exists at this point, so it did not target any variable.
		drop if missing(p_predicted)
		sort p_predicted
		* Running totals of everyone and of those counted as disabled; their difference is the non-disabled band
		gen cumfreq 	= sum(wtfreq) 
		gen cumdis  	= sum(disfreq) 
		gen cumgraph 	= cumfreq - cumdis
		* Convert the running totals to shares of the total weighted frequency
		sum wtfreq
		replace cumfreq 	= cumfreq/`r(sum)'
		replace cumdis 		= cumdis /`r(sum)'
		replace cumgraph 	= cumgraph/`r(sum)'
		* Graphing
*expand 2 in 1, gen(marker)
*replace cumfreq = 0 if marker==1
		gen zero = 0
		sort p_predicted
*browse
*pause
		twoway /*(line cumfreq p_predicted)*/	///
			(rarea cumfreq  cumgraph p_predicted, lwidth(none) fcolor(gs2)  legend(label(1 "Counted as disabled"))) 		///
			(rarea cumgraph zero     p_predicted, lwidth(none) fcolor(gs10) legend(label(2 "Counted as non-disabled")) ) , 	///
			yscale(range(0 1)) ylabel(0(0.25)1)  ytitle("Cumulative prevalence", margin(medium)) scheme(sj) ///
			xtitle("Predicted probability of disability", margin(medium)) legend(ring(0) position(6) col(1) width(55)) ///
			name(graph_`dismarker', replace) title("{bf:`xtitle'}")
	restore
}
* Put the two panels side by side and export
graph combine graph_${disvar}_predicted_fxd graph_${disvar}_predicted 	, xsize(8)
graph export "${dodir}\Outputs\0_dis_distn_fixedVrandom.png", replace  width(1200) height(700) 
graph close


// Graphing the scales
// For labelling tips, see https://journals.sagepub.com/doi/pdf/10.1177/1536867X1601600315
preserve
	* Collapse to one row per distinct pattern of indicator responses
	drop if missing(p_predicted)		// I think this is non-response to llsi
	generate one = 1
	local patternvars = subinstr("${allindicators}", "i.", "", .) 	// plain variable names, without factor variable notation
	capture egen dispattern = concat(`patternvars'), punct("|")	// for collapsing over; doesn't work if approximate via p_predicted or p_IRT
	collapse (mean) p_predicted p_IRT (iqr) check_pred=p_predicted check_IRT=p_IRT (sum) count=one [pw=${pweight}], by(dispattern)
	* Reference lines: the values are hard-coded here rather than read from the cut-off scalars computed earlier.
	* The vertical line sits on the x-axis (predicted scale) and the horizontal line on the y-axis (latent scale).
	local hline_at 	= 0
	local vline_at 	= .32089901
	* Scatter of latent score against predicted score, marker size weighted by the pattern count, with text labels overlaid
		scatter p_IRT p_predicted [w=count], xtitle("Predicted disbility scale", margin(medium)) ytitle("Latent disability scale", margin(medium)) 		///
			xline(`vline_at') yline(`hline_at') legend(off) scheme(econ)  graphregion(color(white) lcolor(white))	mfcolor(none) mlcolor(navy)	///
			/*Labels*/ 	|| scatteri -0.5 0.5 "Predicted but not latent" 2.5 0.15 "Latent" 2.2 0.15 "but not" 1.9 0.15 "predicted", mlabsize(*1.2) msymbol(none) 
	* Shaded blocks of colour behind the points were tried (rarea between two generated bounds over the latent score) but only partially worked, so were left out
	graph export "${dodir}\Outputs\0_dis_predvirt.png", replace  width(1200) height(700) 
	graph close
restore


/* Further exploratory analysis of distributions
	tabstat p_predicted, by(llsiH_predicted_fxd) stat(min mean max) save
	local max_nodis		= r(Stat1)[3,1] + 0.0000001
	local min_dis 		= r(Stat2)[1,1] - 0.0000001
	local justlower		= `max_nodis'-0.05
	capture drop p_predG
	recode p_predicted  (min / `justlower'			=1 "1_low p") 					///
		/**/			(`justlower' / `max_nodis'	=2 "2_just under threshold")	///
		/**/			(`min_dis'/0.7				=3 "3_above threshold")			///
		/**/			(0.7/max					=4 "4_very high p") 			, gen(p_predG)		
	svy: tab country p_predG, row percent format(%3.1f) cellwidth(20) stubwidth(20) nomarginals
	tabstat p_predicted llsiH_predicted_fxd, by(country) stat(mean) format(%3.2f)
	* Emp rates
	recode p_predG (1 2=1), gen(p_predT)
	logit ${empvar}  ib(1).p_predT##i.${countryvar} 	$controls $controls2 ${if} [pw=${pweight}]
		margins r.p_predT@${countryvar}, 		 at(${controlsmeans}) post
*/

*_____________________________________________________________________________________________________________________________________________________
*
**# CLEANING UP
*_____________________________________________________________________________________________________________________________________________________

* Discard stored estimates and temporary variables
estimates clear
drop rand1
capture drop WGindicatorcount		// NOTE(review): no variable of this name is created in this file - possibly a leftover
* Place the derived disability measures together in the variable order
* NOTE(review): names are hard-coded for llsiH rather than taken from the globals used above
order rllsi_WG llsiH_predicted llsiH_predicted_fxd, before(llsiH_IRT)

* Close this do-file's log if it is open
capture log close disweights